In [1]:
# Plotly: graph objects, renderer configuration (pio) and the Express API (px)
# NOTE(review): `go` is not referenced in the cells visible here — confirm it is still needed
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
# Data analysis and manipulation
import pandas as pd
# Matplotlib, used further down to display the word-cloud images
import matplotlib.pyplot as plt
# Plotly offline module; provides init_notebook_mode and iplot
import plotly.offline as py
py.init_notebook_mode(connected=True)
# Use the 'notebook' renderer by default when a figure is displayed
pio.renderers.default = 'notebook'
In [2]:
# Load covid.csv (path relative to the working directory) into dataset1
dataset1 = pd.read_csv(filepath_or_buffer="covid.csv")
# Preview the first 5 rows
dataset1.head(n=5)
Out[2]:
| Country/Region | Continent | Population | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | iso_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | USA | North America | 3.311981e+08 | 5032179 | NaN | 162804.0 | NaN | 2576668.0 | NaN | 2292707.0 | 18296.0 | 15194.0 | 492.0 | 63139605.0 | 190640.0 | Americas | USA |
| 1 | Brazil | South America | 2.127107e+08 | 2917562 | NaN | 98644.0 | NaN | 2047660.0 | NaN | 771258.0 | 8318.0 | 13716.0 | 464.0 | 13206188.0 | 62085.0 | Americas | BRA |
| 2 | India | Asia | 1.381345e+09 | 2025409 | NaN | 41638.0 | NaN | 1377384.0 | NaN | 606387.0 | 8944.0 | 1466.0 | 30.0 | 22149351.0 | 16035.0 | South-EastAsia | IND |
| 3 | Russia | Europe | 1.459409e+08 | 871894 | NaN | 14606.0 | NaN | 676357.0 | NaN | 180931.0 | 2300.0 | 5974.0 | 100.0 | 29716907.0 | 203623.0 | Europe | RUS |
| 4 | South Africa | Africa | 5.938157e+07 | 538184 | NaN | 9604.0 | NaN | 387316.0 | NaN | 141264.0 | 539.0 | 9063.0 | 162.0 | 3149807.0 | 53044.0 | Africa | ZAF |
In [3]:
# First line: shape as a (rows, columns) tuple.
# Second line: size, i.e. the total number of cells (rows * columns).
print(dataset1.shape, dataset1.size, sep="\n")
(209, 17) 3553
In [4]:
# Concise summary of dataset1: index range, per-column non-null counts,
# dtypes and memory usage (info() prints this itself)
dataset1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 209 entries, 0 to 208 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country/Region 209 non-null object 1 Continent 208 non-null object 2 Population 208 non-null float64 3 TotalCases 209 non-null int64 4 NewCases 4 non-null float64 5 TotalDeaths 188 non-null float64 6 NewDeaths 3 non-null float64 7 TotalRecovered 205 non-null float64 8 NewRecovered 3 non-null float64 9 ActiveCases 205 non-null float64 10 Serious,Critical 122 non-null float64 11 Tot Cases/1M pop 208 non-null float64 12 Deaths/1M pop 187 non-null float64 13 TotalTests 191 non-null float64 14 Tests/1M pop 191 non-null float64 15 WHO Region 184 non-null object 16 iso_alpha 209 non-null object dtypes: float64(12), int64(1), object(4) memory usage: 27.9+ KB
In [5]:
# Load covid_grouped.csv (path relative to the working directory) into dataset2
dataset2 = pd.read_csv(filepath_or_buffer="covid_grouped.csv")
# Preview the first 5 rows
dataset2.head(n=5)
Out[5]:
| Date | Country/Region | Confirmed | Deaths | Recovered | Active | New cases | New deaths | New recovered | WHO Region | iso_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-22 | Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Eastern Mediterranean | AFG |
| 1 | 2020-01-22 | Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe | ALB |
| 2 | 2020-01-22 | Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa | DZA |
| 3 | 2020-01-22 | Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe | AND |
| 4 | 2020-01-22 | Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa | AGO |
In [6]:
# First line: shape as a (rows, columns) tuple.
# Second line: size, i.e. the total number of cells (rows * columns).
print(dataset2.shape, dataset2.size, sep="\n")
(35156, 11) 386716
In [7]:
# Concise summary of dataset2: index range, per-column non-null counts, dtypes and memory usage
dataset2.info() # info() prints the summary itself
<class 'pandas.core.frame.DataFrame'> RangeIndex: 35156 entries, 0 to 35155 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 35156 non-null object 1 Country/Region 35156 non-null object 2 Confirmed 35156 non-null int64 3 Deaths 35156 non-null int64 4 Recovered 35156 non-null int64 5 Active 35156 non-null int64 6 New cases 35156 non-null int64 7 New deaths 35156 non-null int64 8 New recovered 35156 non-null int64 9 WHO Region 35156 non-null object 10 iso_alpha 35156 non-null object dtypes: int64(7), object(4) memory usage: 3.0+ MB
In [8]:
# Column labels of dataset1 (used to pick the columns dropped in the next cell)
dataset1.columns
Out[8]:
Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
'TotalTests', 'Tests/1M pop', 'WHO Region', 'iso_alpha'],
dtype='object')
In [9]:
# Drop the NewCases, NewDeaths and NewRecovered *columns* from dataset1.
# dataset1.info() reports at most 4 non-null values in each of them (209 rows).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second run no longer raises KeyError for
# columns that are already gone.
dataset1 = dataset1.drop(
    columns=['NewCases', 'NewDeaths', 'NewRecovered'],
    errors='ignore',
)
# Show 5 randomly chosen rows; the fixed seed makes the preview reproducible
dataset1.sample(5, random_state=42)
Out[9]:
| Country/Region | Continent | Population | TotalCases | TotalDeaths | TotalRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | iso_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 28 | Bolivia | South America | 11688459.0 | 86423 | 3465.0 | 27373.0 | 55585.0 | 71.0 | 7394.0 | 296.0 | 183583.0 | 15706.0 | Americas | BOL |
| 35 | Belgium | Europe | 11594739.0 | 71158 | 9859.0 | 17661.0 | 43638.0 | 61.0 | 6137.0 | 850.0 | 1767120.0 | 152407.0 | Europe | BEL |
| 80 | Senegal | Africa | 16783877.0 | 10715 | 223.0 | 7101.0 | 3391.0 | 33.0 | 638.0 | 13.0 | 114761.0 | 6838.0 | Africa | SEN |
| 163 | Comoros | Africa | 871326.0 | 396 | 7.0 | 340.0 | 49.0 | NaN | 454.0 | 8.0 | NaN | NaN | Africa | COM |
| 54 | Algeria | Africa | 43926079.0 | 33626 | 1273.0 | 23238.0 | 9115.0 | 57.0 | 766.0 | 29.0 | NaN | NaN | Africa | DZA |
In [10]:
# Figure-factory helper that renders a DataFrame as a Plotly table
from plotly.figure_factory import create_table

# Three colour stops given as [position, hex colour] pairs
colorscale = [
    [0, '#4d004c'],
    [.5, '#f2e5ff'],
    [1, '#ffffff'],
]
# Table built from the first 15 rows of dataset1, displayed via plotly.offline
table = create_table(dataset1.head(15), colorscale=colorscale)
py.iplot(table)
In [11]:
# TotalCases per country for the first 15 rows; bars coloured by TotalCases
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalCases',
    hover_data=['Country/Region', 'Continent'],
    height=500,
)
In [12]:
# TotalCases per country for the first 15 rows; bars coloured by TotalDeaths
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalDeaths',
    hover_data=['Country/Region', 'Continent'],
    height=500,
)
In [13]:
# TotalCases per country for the first 15 rows; bars coloured by TotalRecovered.
# The preceding cell already shows the TotalDeaths colouring, so this one
# covers the recovery figures instead of rendering the same chart twice.
# NOTE(review): TotalRecovered has a few missing values (205 of 209 non-null
# per dataset1.info()); confirm how rows without a value should be shown.
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalRecovered',
    height=500,
    hover_data=['Country/Region', 'Continent'],
)
In [14]:
# TotalCases per country for the first 15 rows; bars coloured by TotalTests
px.bar(
    dataset1.head(15),
    x='Country/Region',
    y='TotalCases',
    color='TotalTests',
    hover_data=['Country/Region', 'Continent'],
    height=500,
)
In [15]:
# Horizontal bars: TotalTests per country for the first 15 rows
px.bar(
    dataset1.head(15),
    x='TotalTests',
    y='Country/Region',
    orientation='h',
    color='TotalTests',
    hover_data=['Country/Region', 'Continent'],
    height=500,
)
In [16]:
# Horizontal bars: TotalTests of the first 15 rows, placed on their Continent
px.bar(
    dataset1.head(15),
    x='TotalTests',
    y='Continent',
    orientation='h',
    color='TotalTests',
    hover_data=['Country/Region', 'Continent'],
    height=500,
)
In [17]:
# Bubble chart over every row of dataset1: TotalCases per Continent,
# with both marker colour and marker size driven by TotalCases
px.scatter(
    dataset1,
    x='Continent',
    y='TotalCases',
    size='TotalCases',
    size_max=80,
    color='TotalCases',
    hover_data=['Country/Region', 'Continent'],
)
In [18]:
# Bubble chart: TotalTests per Continent for the first 54 rows
# NOTE(review): head(54) stops just before row 54 (Algeria), whose TotalTests
# is NaN in the sample shown earlier — presumably the cut-off keeps NaN out of
# the marker sizes; confirm before changing it.
px.scatter(
    dataset1.head(54),
    x='Continent',
    y='TotalTests',
    size='TotalTests',
    size_max=80,
    color='TotalTests',
    hover_data=['Country/Region', 'Continent'],
)
In [19]:
# Bubble chart: TotalTests per Continent for the first 50 rows, log-scaled y axis
px.scatter(
    dataset1.head(50),
    x='Continent',
    y='TotalTests',
    log_y=True,
    size='TotalTests',
    size_max=80,
    color='TotalTests',
    hover_data=['Country/Region', 'Continent'],
)
In [20]:
# Bubble chart: TotalCases per country for the first 100 rows
px.scatter(
    dataset1.head(100),
    x='Country/Region',
    y='TotalCases',
    size='TotalCases',
    size_max=80,
    color='TotalCases',
    hover_data=['Country/Region', 'Continent'],
)
In [21]:
# Bubble chart: TotalCases per country for the first 30 rows, log-scaled y axis;
# each country gets its own colour
px.scatter(
    dataset1.head(30),
    x='Country/Region',
    y='TotalCases',
    log_y=True,
    size='TotalCases',
    size_max=80,
    color='Country/Region',
    hover_data=['Country/Region', 'Continent'],
)
In [22]:
# Bubble chart: TotalDeaths per country for the first 10 rows;
# each country gets its own colour
px.scatter(
    dataset1.head(10),
    x='Country/Region',
    y='TotalDeaths',
    size='TotalDeaths',
    size_max=80,
    color='Country/Region',
    hover_data=['Country/Region', 'Continent'],
)
In [23]:
# Bubble chart: Tests/1M pop per country for the first 30 rows;
# each country gets its own colour
px.scatter(
    dataset1.head(30),
    x='Country/Region',
    y='Tests/1M pop',
    size='Tests/1M pop',
    size_max=80,
    color='Country/Region',
    hover_data=['Country/Region', 'Continent'],
)
In [24]:
# Bubble chart: Tests/1M pop per country for the first 30 rows;
# marker colour follows the Tests/1M pop value itself
px.scatter(
    dataset1.head(30),
    x='Country/Region',
    y='Tests/1M pop',
    size='Tests/1M pop',
    size_max=80,
    color='Tests/1M pop',
    hover_data=['Country/Region', 'Continent'],
)
In [25]:
# TotalDeaths against TotalCases for the first 30 rows;
# marker colour and size both follow TotalDeaths
px.scatter(
    dataset1.head(30),
    x='TotalCases',
    y='TotalDeaths',
    size='TotalDeaths',
    size_max=80,
    color='TotalDeaths',
    hover_data=['Country/Region', 'Continent'],
)
In [26]:
# TotalDeaths against TotalCases for the first 30 rows, both axes log-scaled
px.scatter(
    dataset1.head(30),
    x='TotalCases',
    y='TotalDeaths',
    log_x=True,
    log_y=True,
    size='TotalDeaths',
    size_max=80,
    color='TotalDeaths',
    hover_data=['Country/Region', 'Continent'],
)
In [27]:
# TotalCases against TotalTests for the first 30 rows, both axes log-scaled;
# marker colour and size both follow TotalTests
px.scatter(
    dataset1.head(30),
    x='TotalTests',
    y='TotalCases',
    log_x=True,
    log_y=True,
    size='TotalTests',
    size_max=80,
    color='TotalTests',
    hover_data=['Country/Region', 'Continent'],
)
In [ ]:
# Confirmed counts per Date over every row of dataset2, coloured by Confirmed
px.bar(
    dataset2,
    x="Date",
    y="Confirmed",
    color="Confirmed",
    height=400,
    hover_data=["Confirmed", "Date", "Country/Region"],
)
In [ ]:
# Confirmed counts per Date over every row of dataset2, log-scaled y axis
px.bar(
    dataset2,
    x="Date",
    y="Confirmed",
    log_y=True,
    color="Confirmed",
    height=400,
    hover_data=["Confirmed", "Date", "Country/Region"],
)
In [ ]:
# Deaths per Date over every row of dataset2, linear y axis, coloured by Deaths;
# the hover box additionally lists Confirmed, Date and Country/Region
px.bar(
    dataset2,
    x="Date",
    y="Deaths",
    log_y=False,
    color="Deaths",
    height=400,
    hover_data=["Confirmed", "Date", "Country/Region"],
)
In [ ]:
# Keep only the dataset2 rows whose Country/Region is exactly "US"
df_US = dataset2.loc[dataset2["Country/Region"].eq("US")]
In [90]:
# US only: Confirmed per Date as bars, coloured by Confirmed
px.bar(
    df_US,
    x="Date",
    y="Confirmed",
    height=400,
    color="Confirmed",
)
In [91]:
# US only: Recovered per Date as bars, coloured by Recovered
px.bar(
    df_US,
    x="Date",
    y="Recovered",
    height=400,
    color="Recovered",
)
In [92]:
# US only: Recovered over Date as a line
px.line(
    df_US,
    x="Date",
    y="Recovered",
    height=400,
)
In [93]:
# US only: Deaths over Date as a line
px.line(
    df_US,
    x="Date",
    y="Deaths",
    height=400,
)
In [94]:
# US only: Confirmed over Date as a line
px.line(
    df_US,
    x="Date",
    y="Confirmed",
    height=400,
)
In [95]:
# US only: "New cases" over Date as a line
px.line(
    df_US,
    x="Date",
    y="New cases",
    height=400,
)
In [96]:
# US only: "New cases" per Date as bars
px.bar(
    df_US,
    x="Date",
    y="New cases",
    height=400,
)
In [97]:
# US only: Deaths plotted against Confirmed, one point per row of df_US
px.scatter(
    df_US,
    x="Confirmed",
    y="Deaths",
    height=400,
)
In [98]:
# World map of Confirmed, one animation frame per Date;
# countries are matched through the iso_alpha column
px.choropleth(
    dataset2,
    locations="iso_alpha",
    hover_name="Country/Region",
    color="Confirmed",
    color_continuous_scale="Blues",
    animation_frame="Date",
)
In [99]:
# World map of Deaths, one animation frame per Date;
# countries are matched through the iso_alpha column
px.choropleth(
    dataset2,
    locations='iso_alpha',
    hover_name="Country/Region",
    color="Deaths",
    color_continuous_scale="Viridis",
    animation_frame="Date",
)
In [100]:
# World map of Recovered on the "natural earth" projection,
# one animation frame per Date; countries are matched through iso_alpha
px.choropleth(
    dataset2,
    locations='iso_alpha',
    hover_name="Country/Region",
    projection="natural earth",
    color="Recovered",
    color_continuous_scale="RdYlGn",
    animation_frame="Date",
)
In [101]:
# Animated bars: Confirmed per WHO Region, one frame per Date. Each row
# (country) contributes its own segment to the bar of its region.
# The y range is pinned to the largest per-region total over all dates:
# without a fixed range the axis is scaled from the first frame, where the
# counts shown at the top of dataset2 are zero, and later frames overflow it.
px.bar(
    dataset2,
    x="WHO Region",
    y="Confirmed",
    color="WHO Region",
    animation_frame="Date",
    hover_name="Country/Region",
    range_y=[
        0,
        int(dataset2.groupby(["Date", "WHO Region"])["Confirmed"].sum().max()),
    ],
)
In [102]:
# Load coviddeath.csv (path relative to the working directory) into dataset3
dataset3 = pd.read_csv(filepath_or_buffer="coviddeath.csv")
# Preview the first 5 rows
dataset3.head(n=5)
Out[102]:
| Data as of | Start Week | End Week | State | Condition Group | Condition | ICD10_codes | Age Group | Number of COVID-19 Deaths | Flag | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 0-24 | 122.0 | NaN |
| 1 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 25-34 | 596.0 | NaN |
| 2 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 35-44 | 1521.0 | NaN |
| 3 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 45-54 | 4186.0 | NaN |
| 4 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 55-64 | 10014.0 | NaN |
In [103]:
# Preview the last 5 rows of dataset3
dataset3.tail(n=5)
Out[103]:
| Data as of | Start Week | End Week | State | Condition Group | Condition | ICD10_codes | Age Group | Number of COVID-19 Deaths | Flag | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12255 | 08/30/2020 | 02/01/2020 | 08/29/2020 | YC | Coronavirus Disease 2019 | COVID-19 | U071 | 65-74 | 5024.0 | NaN |
| 12256 | 08/30/2020 | 02/01/2020 | 08/29/2020 | YC | Coronavirus Disease 2019 | COVID-19 | U071 | 75-84 | 5381.0 | NaN |
| 12257 | 08/30/2020 | 02/01/2020 | 08/29/2020 | YC | Coronavirus Disease 2019 | COVID-19 | U071 | 85+ | 4841.0 | NaN |
| 12258 | 08/30/2020 | 02/01/2020 | 08/29/2020 | YC | Coronavirus Disease 2019 | COVID-19 | U071 | Not stated | NaN | Counts less than 10 suppressed. |
| 12259 | 08/30/2020 | 02/01/2020 | 08/29/2020 | YC | Coronavirus Disease 2019 | COVID-19 | U071 | All ages | 20628.0 | NaN |
In [104]:
# Non-null count of every other column, per distinct Condition
dataset3.groupby("Condition").count()
Out[104]:
| Data as of | Start Week | End Week | State | Condition Group | ICD10_codes | Age Group | Number of COVID-19 Deaths | Flag | |
|---|---|---|---|---|---|---|---|---|---|
| Condition | |||||||||
| Adult respiratory distress syndrome | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 272 | 268 |
| All other conditions and causes (residual) | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 363 | 177 |
| Alzheimer disease | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 144 | 386 |
| COVID-19 | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 377 | 163 |
| Cardiac arrest | 520 | 520 | 520 | 520 | 520 | 520 | 520 | 219 | 301 |
| Cardiac arrhythmia | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 192 | 348 |
| Cerebrovascular diseases | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 187 | 343 |
| Chronic lower respiratory diseases | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 229 | 311 |
| Diabetes | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 276 | 264 |
| Heart failure | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 204 | 336 |
| Hypertensive diseases | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 264 | 276 |
| Influenza and pneumonia | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 331 | 209 |
| Intentional and unintentional injury, poisoning, and other adverse events | 520 | 520 | 520 | 520 | 520 | 520 | 520 | 188 | 332 |
| Ischemic heart disease | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 224 | 316 |
| Malignant neoplasms | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 198 | 342 |
| Obesity | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 182 | 348 |
| Other diseases of the circulatory system | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 213 | 317 |
| Other diseases of the respiratory system | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 188 | 352 |
| Renal failure | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 238 | 302 |
| Respiratory arrest | 480 | 480 | 480 | 480 | 480 | 480 | 480 | 111 | 369 |
| Respiratory failure | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 320 | 220 |
| Sepsis | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 243 | 287 |
| Vascular and unspecified dementia | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 191 | 339 |
In [105]:
# Import word cloud
from wordcloud import WordCloud

# One entry per row of the Condition column, joined into a single
# space-separated text for the word cloud
sentences = dataset3["Condition"].tolist()
sentences_as_a_string = ' '.join(sentences)

# Generate the word cloud and display it as an image
plt.figure(figsize=(20, 20))
plt.imshow(WordCloud().generate(sentences_as_a_string))
plt.axis("off")  # pixel-coordinate ticks carry no meaning for a word cloud
plt.show()       # draw the figure instead of echoing the AxesImage repr
Out[105]:
<matplotlib.image.AxesImage at 0x16cb770dc60>
In [106]:
# One entry per row of the "Condition Group" column
column2_tolist = dataset3["Condition Group"].tolist()
# Join the list into one single space-separated string
column_to_string = " ".join(column2_tolist)

# Generate the word cloud (WordCloud is imported in the previous cell)
# and display it as an image
plt.figure(figsize=(20, 20))
plt.imshow(WordCloud().generate(column_to_string))
plt.axis("off")  # pixel-coordinate ticks carry no meaning for a word cloud
plt.show()       # draw the figure instead of echoing the AxesImage repr
Out[106]:
<matplotlib.image.AxesImage at 0x16cb7696500>
In [ ]:
Thank you! The complete project is available on GitHub: https://github.com/Vikas-Yadav-6696/COVID-19-PYTHON-DATA-ANALYSIS
In [ ]: